import geopandas as gpd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import httpx
import rioxarray
import xvec
import folium
import ee
import geemap
import esda
import contextily
from libpysal import graph
import statsmodels.formula.api as smf
import mgwr
from mgwr.gwr import GWR
from mgwr.sel_bw import Sel_BW

Environmental and Spatial Predictors of Bird Diversity across the United States
Spatial Analysis of BBS Data
1 Introduction
One of the basic questions in biology asks the following: What is the origin of biodiversity? Why are there so many distinct species at the same site? These questions can be examined through evolutionary and ecological perspectives, which are often connected. The ecological approach is crucial, because it links the evolutionary history of coexisting species with the environment in which they occur and examines the factors which allow not just the build-up of local (and consequently regional) species richness, but also the ways it is maintained over time. One of the factors that facilitates species coexistence in birds and therefore drives their species richness (number of species at the site) on the regional scale is primary productivity of the environment (Pigot, Tobias, and Jetz 2016).
At broad spatial scales, species richness typically increases with productivity (Hawkins, Porter, and Diniz-Filho 2003). In contrast, at finer spatial scales, the relationship between productivity and species richness is more complex: positive, negative, hump-shaped, and U-shaped relationships have all been documented (Mittelbach et al. 2001). There are several hypotheses for this relationship between productivity and species richness, but we still do not know the ultimate mechanism behind it (Di Cecco et al. 2022). We will look at how the productivity of vegetation drives the species richness on the continental scale of North America. For this, I will use the Breeding Bird Survey (BBS) dataset, which is one of the most significant and longest-running “citizen science” programmes in the world. Established in 1966, its primary goal was to monitor the status and trends of North American bird populations on a continental scale. This census operates on the landscape scale as the routes are about 40 km long with the 50 stops where the volunteers count birds. As the measure of primary productivity and environmental heterogeneity of the vegetation, I decided to use the Normalised Difference Vegetation Index (NDVI) which is used as a proxy for primary productivity in both regional (Nieto, Flombaum, and Garbulsky 2015) and local studies (Brüggeshemke and Fartmann 2025).
- How does vegetation density/heterogeneity predict bird diversity across the United States?
- How do spatial predictors explain the patterns of species richness in North American birds?
- Prepare and explore the data - clean BBS routes, extract NDVI for routes geometry, and get elevation data with earth engine
- Evaluate global and local spatial autocorrelation in species richness and model residuals
- Build the OLS model and compare it with GWR model
2 Data preparation
2.1 Study area
I chose the lower 48 states of USA as the study area because for them the route geometries are publicly available. In the basic dataset of BBS, the geometry is included only for the starting point of the routes, which are on average about 40 km long and therefore the single point does not optimally represent their geometry. Firstly, we need to select the boundary for the study area.
# Load US state boundaries and keep only the conterminous (lower 48) states.
us_states = gpd.read_file("study_area/cb_2018_us_state_500k.shp")
non_conus = ["AK", "HI", "PR", "VI", "MP", "GU", "AS"]  # Alaska, Hawaii and overseas territories
us_states = us_states[~us_states["STUSPS"].isin(non_conus)]  # Boolean mask instead of query()
us_boundary = us_states.dissolve()
2.2 BBS data
Now it is time to load the data about species richness, which consists of the number of species detected at the route. I decided to use data from 2018, which I already prepared in R, and computed species richness for each transect. This data has one geometry column with the coordinates of the starting point of the route. I downloaded the data at the official project site.
# Read the pre-computed 2018 species-richness layer (one start point per route).
bbs = gpd.read_file("bbs/bbs_routes_SR_2018.gpkg", engine="fiona")
# Keep US routes (CountryNum 840) and exclude Alaska (StateNum 3) in one pass.
keep = (bbs["CountryNum"] == 840) & (bbs["StateNum"] != 3)
bbs = bbs.loc[keep]
bbs["RTENO"] = bbs["RTENO"].astype(str).str[3:] # Slice the string (skips the first 3 characters)
Let’s load the data for the routes. Unfortunately, the route geometries are often inaccurate and sometimes consist of several fragments, which can cause problems. Therefore, we first need to remove potentially wrong route geometries. The expected route geometry ought to be about 40 km long, so let us filter potential outliers from this value. This cleaning will result in the loss of several hundred datapoints, but we need precise route geometries for data extraction.
# Load the route lines, project to an equal-area CRS and keep plausible ~40 km routes.
routes = gpd.read_file("routes/bbsrtsl020.shp")
routes = routes.to_crs(epsg=5070)  # USA Contiguous Albers Equal Area Conic (units: metres)
routes["calc_length_km"] = routes.geometry.length / 1000  # Geometric length of each route
routes = routes.loc[routes["RTELENG"] <= 45000]  # Dataset-reported length at most 45 km
length_ok = routes["calc_length_km"].between(35, 45)  # Filter out fragments and over-long routes
routes_filtered = routes[length_ok].copy()
print(f"Original routes count: {len(routes)}")
print(f"Filtered routes count (35-45km): {len(routes_filtered)}")
print(f"Average route length: {routes_filtered['calc_length_km'].mean():.2f} km")
Original routes count: 3062
Filtered routes count (35-45km): 2570
Average route length: 40.14 km
Let us perform the spatial join of these two geodataframes. To combine BBS data with routes geometries we use spatial join using “inner” as we need matching values in both tables. I could also join the tables by column “RTENO”, but I want to be sure that the geometries match.
# Align the CRS, buffer the start points slightly, and join them to the route lines.
bbs = bbs.to_crs(routes_filtered.crs)  # Reproject points into the routes' coordinate system
bbs = bbs.drop(columns="RTENO")  # The routes layer carries its own RTENO column
bbs["geometry"] = bbs.buffer(400)  # 400 m tolerance around start points for GPS inaccuracy
# Inner spatial join: keep only route geometries whose buffered start point intersects them
joined_data = gpd.sjoin(routes_filtered, bbs, how="inner", predicate="intersects")
print(f"Routes in subset: {len(routes_filtered)}")
print(f"Routes with data: {len(joined_data['RTENO'].unique())}")
Routes in subset: 2570
Routes with data: 1611
The large drop in the number of routes is due to missing BBS data for the route geometries, which could be due to annual differences in the census - in some years, particular routes are not counted due to weather conditions, terrain obstacles or illness of the observer. However, the number is still very large, so there are probably some other errors and inaccuracies in geometry matching. Now we will do the final preparation of the table for the NDVI and elevation values extraction. We need to get rid of redundant columns, create buffer around the routes, add the route midpoint and create Lat/Lon columns for external API.
# Slim the joined table down to the columns needed for extraction, then derive
# a 3 km buffer, a route midpoint and WGS84 lon/lat columns for external APIs.
bbs_routes = joined_data[["RTENO", "species_richness", "geometry"]].copy()  # .copy() breaks the link to joined_data
bbs_routes["buff_3000"] = bbs_routes.buffer(3000)  # 3 km buffer - choice explained in the NDVI extraction part
# Midpoint at 50% of the line length, used as the route's reference point geometry
bbs_routes["route_midpoint"] = bbs_routes.geometry.interpolate(0.5, normalized=True)
bbs_routes = bbs_routes.drop(columns="geometry")  # The raw line geometry is no longer needed
midpoints_wgs84 = bbs_routes.set_geometry("route_midpoint").to_crs(4326)
bbs_routes["lon"] = midpoints_wgs84.geometry.x
bbs_routes["lat"] = midpoints_wgs84.geometry.y
bbs_routes = bbs_routes.set_geometry("buff_3000")
bbs_routes.head()
| RTENO | species_richness | buff_3000 | route_midpoint | lon | lat | |
|---|---|---|---|---|---|---|
| 0 | 2072 | 63 | POLYGON ((782767.664 1352804.909, 782613.87 13... | POINT (799058.576 1358112.615) | -87.165585 | 34.947661 |
| 2 | 2204 | 54 | POLYGON ((911518.247 1350136.581, 911484.644 1... | POINT (920348.359 1362929.663) | -85.828469 | 34.882272 |
| 3 | 2073 | 65 | POLYGON ((883221.105 1343004.45, 883424.384 13... | POINT (896739.418 1353510.697) | -86.098287 | 34.820791 |
| 4 | 2071 | 61 | POLYGON ((766029.847 1327436.538, 765484.087 1... | POINT (776778.876 1331128.227) | -87.437364 | 34.725655 |
| 5 | 2007 | 63 | POLYGON ((869910.569 1340151.781, 869882.814 1... | POINT (879587.76 1341112.337) | -86.300271 | 34.726327 |
2.3 NDVI raster
The normalised difference vegetation index (NDVI) indicates vegetation health within raster image pixels by quantifying how much near-infrared (NIR) light the plants reflect. Healthy vegetation reflects a higher proportion of NIR and a lower proportion of red light than stressed vegetation or non-living surfaces with similar visible colours (for example, turf fields). This contrast makes NDVI a practical tool for evaluating how healthy vegetation is in a raster image relative to its surroundings.
NDVI is calculated for each pixel with the following calculation:
\(\Large NDVI = \frac{(NIR - Red)}{(NIR + Red)}\)
This formula generates a value between -1 and +1. Low reflectance in the red channel and high reflectance in the NIR channel will yield a high NDVI value (healthy vegetation), while the inverse will result in a low NDVI value (unhealthy vegetation). Negative values typically represent non-vegetation such as water or rock.
The NDVI raster for the whole area would be very large and computationally demanding. Therefore, I decided to choose a different approach and work with a compressed 8-bit raster (values from 0 to 255) from NASA Earth Observations (NEO), which represents a 1-month average for June. The values contained in this file have been scaled and resampled to support visualization in NEO and are not suitable for rigorous scientific analysis. However, they may be used for basic assessments and for identifying general trends, which is exactly the goal of this study. We only need to resolve the different scaling of the values. We will extract the mean NDVI values for a 3 km buffer; these distances preserve higher predictive power for covariates in studies of breeding birds (Byer et al. 2025).
# Load the NEO monthly NDVI composite (8-bit, scaled for visualization) and
# restrict it to the study area.
ndvi = rioxarray.open_rasterio("MOD_NDVI_M_2018-06-01_rgb_3600x1800.TIFF")
ndvi_us = ndvi.rio.clip(us_boundary.to_crs(ndvi.rio.crs).geometry) # Matching CRS and clipping to study area
ndvi_us = ndvi_us.drop_vars("band").squeeze() # Getting rid of the redundant single-band dimension
ndvi_us = ndvi_us.where(ndvi_us>0) # Mask non-positive values (fill / no-data) as NaN
print(f"Raster CRS: {ndvi_us.rio.crs}") # Check the CRS to see if it's in degrees (4326) or meters
Raster CRS: EPSG:4326
_ = ndvi_us.plot()  # Quick-look map of the clipped NDVI raster
We can see from the map that there is a very high NDVI on the east coast compared to the west of US. Now we can extract values from the raster for the routes buffer we created earlier. We will extract 1) the mean value of NDVI for each buffer and 2) the standard deviation of NDVI for each buffer. The latter will help estimate the heterogeneity of the vegetation.
# Reproject the raster to match the route buffers, then compute per-buffer
# zonal statistics with xvec. NOTE: the call's closing parenthesis is on the
# next line of the file.
ndvi_us = ndvi_us.rio.reproject(bbs_routes.crs) # Reprojecting to EPSG: 5070
ndvi_extraction = bbs_routes.to_crs(ndvi_us.rio.crs)
zonal_stats = ndvi_us.drop_vars("spatial_ref").xvec.zonal_stats(
geometry=ndvi_extraction.buff_3000,  # One polygon per route buffer
x_coords="x",
y_coords="y",
stats=["mean", "std"],  # Mean = productivity proxy, std = heterogeneity proxy
)
Let us check if there are some missing values in the data.
ndvi = zonal_stats.xvec.to_geodataframe(name="NDVI") # Transforming zonal statistics into a geodataframe (note: shadows the raster `ndvi` loaded above)
print(ndvi.isna().sum()) # Returns the count of NaNs for every column in the dataframe
geometry 0
index 0
NDVI 66
dtype: int64
ndvi_clean = ndvi.dropna(subset=["NDVI"]) # Dropping rows where the NDVI value is missing
ndvi_stats = ndvi_clean.reset_index().pivot(
index="index",
columns="zonal_statistics",
values="NDVI"
) # Wide table: one row per route index, one column per computed statistic (mean/std)
ndvi_stats.columns = ["ndvi_" + col for col in ndvi_stats.columns] # Add the "ndvi_" prefix to the new columns
geometries = ndvi_clean[["index", "geometry"]].drop_duplicates("index").set_index("index") # Get the unique geometries for each index
ndvi_data = geometries.join(ndvi_stats).reset_index() # Join them together
Although the NDVI data was retrieved in 8-bit format (0–255), we can standardise it to Z-scores prior to modelling. This process centres the data and removes the original units, ensuring that the coefficients in the models are not biased by the original bit-depth of the satellite imagery.
# Standardize the mean NDVI (relative productivity):
# (value - mean across all routes) / standard deviation across all routes
ndvi_data["ndvi_mean_z"] = (ndvi_data["ndvi_mean"] - ndvi_data["ndvi_mean"].mean()) / ndvi_data["ndvi_mean"].std()
# Standardize the within-buffer std (relative heterogeneity):
# the std is treated as its own variable and standardized in the same way
ndvi_data["ndvi_heterog_z"] = (ndvi_data["ndvi_std"] - ndvi_data["ndvi_std"].mean()) / ndvi_data["ndvi_std"].std()
ndvi_to_join = ndvi_data[["ndvi_mean_z", "ndvi_heterog_z", "geometry"]]
# Spatially join the standardized NDVI values back onto the route buffers.
buffer = bbs_routes.set_geometry("buff_3000")
# how="inner" keeps only routes present in both layers;
# predicate="intersects" matches any overlapping geometries.
bbs_ndvi = gpd.sjoin(buffer, ndvi_to_join, how="inner", predicate="intersects")
bbs_ndvi = bbs_ndvi.drop(columns="index_right")  # Drop the join index column added by sjoin
data = bbs_ndvi.drop_duplicates(subset=["RTENO"])  # Keep one row per route
data.info()
<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 1592 entries, 0 to 3730
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 RTENO 1592 non-null int64
1 species_richness 1592 non-null int64
2 buff_3000 1592 non-null geometry
3 route_midpoint 1592 non-null geometry
4 lon 1592 non-null float64
5 lat 1592 non-null float64
6 ndvi_mean_z 1592 non-null float64
7 ndvi_heterog_z 1592 non-null float64
dtypes: float64(4), geometry(2), int64(2)
memory usage: 111.9 KB
# Interactive map of standardized mean NDVI per route buffer.
explore_opts = dict(
    column="ndvi_mean_z",
    cmap="RdYlGn",             # Red (low) through green (high) vegetation
    marker_kwds={'radius':5},
    vmin=-2.5,
    vmax=2.5,                  # Clamp the colour scale to +/- 2.5 SD
    legend=True,
    tooltip=["ndvi_mean_z"],   # Hover to see raw vs z-score
    tiles="CartoDB positron",
    popup=True,                # Click to see all data for that route
)
m = data.explore(**explore_opts)
m